Load needed libraries
library(fastDummies)
library(readr)
library(ggplot2)
library(dplyr)
library(caret)
library(glmnet)
library(boot)
library(tree)
library(ranger)
library(xgboost)
library(gbm)
library(vip)
library(ISLR)
library(tidyr)
Set the seed for reproducibility
set.seed(1)
Load the dataset
# Read the raw Lending Club export (semicolon-delimited) and keep an
# untouched copy so the cleaning pipeline can be re-run from scratch.
original_lc_data <- read.csv("LCdata.csv",sep = ";")
lc_data <- original_lc_data
remove attributes not available for prediction
# Drop loan-servicing / outcome fields that are only known AFTER the loan is
# issued (payments, recoveries, status, ...) and would therefore leak the
# target into the predictors.
lc_data <- subset(lc_data, select = -c(collection_recovery_fee, installment, issue_d,
last_pymnt_amnt, last_pymnt_d, loan_status,
next_pymnt_d, out_prncp, out_prncp_inv,
pymnt_plan, recoveries, total_pymnt,
total_pymnt_inv,total_rec_int, total_rec_late_fee, total_rec_prncp))
summary(lc_data)
id member_id loan_amnt funded_amnt
Min. : 54734 Min. : 70473 Min. : 500 Min. : 500
1st Qu.: 9207230 1st Qu.:10877939 1st Qu.: 8000 1st Qu.: 8000
Median :34433372 Median :37095300 Median :13000 Median :13000
Mean :32463636 Mean :35000265 Mean :14754 Mean :14741
3rd Qu.:54900100 3rd Qu.:58470266 3rd Qu.:20000 3rd Qu.:20000
Max. :68617057 Max. :73544841 Max. :35000 Max. :35000
funded_amnt_inv term int_rate emp_title
Min. : 0 Length:798641 Min. : 5.32 Length:798641
1st Qu.: 8000 Class :character 1st Qu.: 9.99 Class :character
Median :13000 Mode :character Median :12.99 Mode :character
Mean :14702 Mean :13.24
3rd Qu.:20000 3rd Qu.:16.20
Max. :35000 Max. :28.99
emp_length home_ownership annual_inc verification_status
Length:798641 Length:798641 Min. : 0 Length:798641
Class :character Class :character 1st Qu.: 45000 Class :character
Mode :character Mode :character Median : 65000 Mode :character
Mean : 75014
3rd Qu.: 90000
Max. :9500000
NA's :4
url desc purpose title
Length:798641 Length:798641 Length:798641 Length:798641
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
zip_code addr_state dti delinq_2yrs
Length:798641 Length:798641 Min. : 0.00 Min. : 0.0000
Class :character Class :character 1st Qu.: 11.91 1st Qu.: 0.0000
Mode :character Mode :character Median : 17.66 Median : 0.0000
Mean : 18.16 Mean : 0.3145
3rd Qu.: 23.95 3rd Qu.: 0.0000
Max. :9999.00 Max. :39.0000
NA's :25
earliest_cr_line inq_last_6mths mths_since_last_delinq
Length:798641 Min. : 0.0000 Min. : 0.0
Class :character 1st Qu.: 0.0000 1st Qu.: 15.0
Mode :character Median : 0.0000 Median : 31.0
Mean : 0.6947 Mean : 34.1
3rd Qu.: 1.0000 3rd Qu.: 50.0
Max. :33.0000 Max. :188.0
NA's :25 NA's :408818
mths_since_last_record open_acc pub_rec revol_bal
Min. : 0.0 Min. : 0.00 Min. : 0.0000 Min. : 0
1st Qu.: 51.0 1st Qu.: 8.00 1st Qu.: 0.0000 1st Qu.: 6443
Median : 70.0 Median :11.00 Median : 0.0000 Median : 11876
Mean : 70.1 Mean :11.55 Mean : 0.1953 Mean : 16930
3rd Qu.: 92.0 3rd Qu.:14.00 3rd Qu.: 0.0000 3rd Qu.: 20839
Max. :129.0 Max. :90.00 Max. :63.0000 Max. :2904836
NA's :675190 NA's :25 NA's :25 NA's :2
revol_util total_acc initial_list_status last_credit_pull_d
Min. : 0.00 Min. : 1.00 Length:798641 Length:798641
1st Qu.: 37.70 1st Qu.: 17.00 Class :character Class :character
Median : 56.00 Median : 24.00 Mode :character Mode :character
Mean : 55.05 Mean : 25.27
3rd Qu.: 73.50 3rd Qu.: 32.00
Max. :892.30 Max. :169.00
NA's :454 NA's :25
collections_12_mths_ex_med mths_since_last_major_derog policy_code
Min. : 0.00000 Min. : 0.0 Min. :1
1st Qu.: 0.00000 1st Qu.: 27.0 1st Qu.:1
Median : 0.00000 Median : 44.0 Median :1
Mean : 0.01447 Mean : 44.1 Mean :1
3rd Qu.: 0.00000 3rd Qu.: 61.0 3rd Qu.:1
Max. :20.00000 Max. :188.0 Max. :1
NA's :126 NA's :599107
application_type annual_inc_joint dti_joint
Length:798641 Min. : 17950 Min. : 3.0
Class :character 1st Qu.: 76167 1st Qu.:13.3
Mode :character Median :101886 Median :17.7
Mean :110745 Mean :18.4
3rd Qu.:133000 3rd Qu.:22.6
Max. :500000 Max. :43.9
NA's :798181 NA's :798183
verification_status_joint acc_now_delinq tot_coll_amt
Length:798641 Min. : 0.000000 Min. : 0
Class :character 1st Qu.: 0.000000 1st Qu.: 0
Mode :character Median : 0.000000 Median : 0
Mean : 0.005026 Mean : 228
3rd Qu.: 0.000000 3rd Qu.: 0
Max. :14.000000 Max. :9152545
NA's :25 NA's :63276
tot_cur_bal open_acc_6m open_il_6m open_il_12m
Min. : 0 Min. : 0.0 Min. : 0.0 Min. : 0.0
1st Qu.: 29861 1st Qu.: 0.0 1st Qu.: 1.0 1st Qu.: 0.0
Median : 80647 Median : 1.0 Median : 2.0 Median : 0.0
Mean : 139508 Mean : 1.1 Mean : 2.9 Mean : 0.8
3rd Qu.: 208229 3rd Qu.: 2.0 3rd Qu.: 4.0 3rd Qu.: 1.0
Max. :8000078 Max. :14.0 Max. :33.0 Max. :12.0
NA's :63276 NA's :779525 NA's :779525 NA's :779525
open_il_24m mths_since_rcnt_il total_bal_il il_util
Min. : 0.0 Min. : 0.0 Min. : 0 Min. : 0.0
1st Qu.: 0.0 1st Qu.: 6.0 1st Qu.: 10164 1st Qu.: 58.4
Median : 1.0 Median : 12.0 Median : 24544 Median : 74.8
Mean : 1.7 Mean : 21.1 Mean : 36428 Mean : 71.5
3rd Qu.: 2.0 3rd Qu.: 23.0 3rd Qu.: 47640 3rd Qu.: 87.7
Max. :19.0 Max. :363.0 Max. :878459 Max. :223.3
NA's :779525 NA's :780030 NA's :779525 NA's :782007
open_rv_12m open_rv_24m max_bal_bc all_util
Min. : 0.0 Min. : 0 Min. : 0 Min. : 0.0
1st Qu.: 0.0 1st Qu.: 1 1st Qu.: 2406 1st Qu.: 47.6
Median : 1.0 Median : 2 Median : 4502 Median : 61.9
Mean : 1.4 Mean : 3 Mean : 5878 Mean : 60.8
3rd Qu.: 2.0 3rd Qu.: 4 3rd Qu.: 7774 3rd Qu.: 75.2
Max. :22.0 Max. :43 Max. :83047 Max. :151.4
NA's :779525 NA's :779525 NA's :779525 NA's :779525
total_rev_hi_lim inq_fi total_cu_tl inq_last_12m
Min. : 0 Min. : 0.0 Min. : 0.0 Min. :-4
1st Qu.: 13900 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0
Median : 23700 Median : 0.0 Median : 0.0 Median : 2
Mean : 32093 Mean : 0.9 Mean : 1.5 Mean : 2
3rd Qu.: 39800 3rd Qu.: 1.0 3rd Qu.: 2.0 3rd Qu.: 3
Max. :9999999 Max. :16.0 Max. :35.0 Max. :32
NA's :63276 NA's :779525 NA's :779525 NA's :779525
First we delete the columns which aren’t useful for our prediction
# Drop pure-identifier columns (row ids, member id, zip code, listing url)
# that carry no predictive signal for the interest rate.
lc_data <- lc_data %>% select(-id, -member_id, -zip_code, -url)
Looks like policy_code contains just value equal to 1, it can be removed
lc_data$policy_code <- NULL
Remove additional columns which are related to the historical data
lc_data$last_credit_pull_d <- NULL
Then we delete the columns which can’t be converted to categorical and require NLP
lc_data$title <- NULL
lc_data$desc <- NULL
lc_data$emp_title <- NULL
let’s examine the loan_amnt column
sum(is.na(lc_data$loan_amnt))
[1] 0
cor(lc_data$loan_amnt, lc_data$int_rate)
[1] 0.1447189
hist(lc_data$loan_amnt, breaks = 20, main = "loan_amnt distribution", xlab = "loan_amnt", col = "lightblue", border = "black")
ggplot(data = lc_data, mapping = aes(x=int_rate,y=loan_amnt)) + geom_boxplot()
standardize loan_amnt
#lc_data$loan_amnt <- scale(lc_data$loan_amnt)
let’s examine the funded_amnt column
sum(is.na(lc_data$funded_amnt))
[1] 0
cor(lc_data$funded_amnt, lc_data$int_rate)
[1] 0.1448634
hist(lc_data$funded_amnt, breaks = 20, main = "funded_amnt distribution", xlab = "funded_amnt", col = "lightblue", border = "black")
as we can see, funded_amnt is almost the same as the loan_amnt column, consequently, we remove it.
lc_data$funded_amnt <- NULL
let’s examine the funded_amnt_inv column
sum(is.na(lc_data$funded_amnt_inv))
[1] 0
cor(lc_data$funded_amnt_inv, lc_data$int_rate)
[1] 0.1449083
hist(lc_data$funded_amnt_inv, breaks = 20, main = "funded_amnt_inv distribution", xlab = "funded_amnt_inv", col = "lightblue", border = "black")
remove funded_amnt_inv for the same reason as above
lc_data$funded_amnt_inv <- NULL
let’s see the int_rate distribution.
hist(lc_data$int_rate, breaks = 20, main = "int_rate distribution", xlab = "int_rate", col = "lightblue", border = "black")
Standardize int rate:
#lc_data$int_rate <- scale(lc_data$int_rate)
we delete the emp_title column as there are several entries for the same job title and because there are too many different values for one-hot encoding. In addition, some titles are unclear (NLP required)
n_distinct(lc_data$emp_title)
[1] 0
As we can observe, there are 40363 NAs. We can assume 40363 do not work.
barplot(table(lc_data$emp_length),
xlab = "emp_length years",
ylab = "Frequency",
col = "skyblue",
border = "black",
cex.names = 0.6) # The size of the main title
Since emp_length seems to be categorical, we transform it to as a factor and then as numeric. The conversion to numeric is needed for supporting the XGBoost
lc_data$emp_length <- as.factor(lc_data$emp_length)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=emp_length)) + geom_boxplot()
lc_data$emp_length <- as.numeric(lc_data$emp_length)
term
# Encode term as a factor, inspect its relationship with int_rate, then
# convert to numeric codes (XGBoost requires numeric inputs).
lc_data$term <- as.factor(lc_data$term)
ggplot(data = lc_data, mapping = aes(x = int_rate, y = term)) + geom_boxplot()
lc_data$term <- as.numeric(lc_data$term)
# NOTE(review): removed the original `boh = order(lc_data$term)` — the `boh`
# variable was never used anywhere in the script (dead code).
Cleaning of home_ownership:
During the data cleaning phase, our analysis revealed that the variable “home_ownership” does not show a distinct correlation with interest rates. Specifically, among the categories, “ANY” and “OTHER” contain 2 and 154 cases, respectively, while the “NONE” category comprises 39 cases. Although the “NONE” category appears to demonstrate a higher interest rate compared to others, the limited sample size of 39 cases raises doubts about the reliability of this observation. Notably, the “NONE” category might pertain to individuals experiencing homelessness, prompting ethical concerns about loan provision to this demographic.
table(lc_data$home_ownership)
ANY MORTGAGE NONE OTHER OWN RENT
2 399151 45 155 78789 320499
ggplot(data = lc_data, mapping = aes(x=int_rate,y=home_ownership)) + geom_boxplot()
Then, we retain mortgage, own and rent:
lc_data <- lc_data %>% filter(home_ownership %in% c("MORTGAGE","OWN","RENT"))
lc_data$home_ownership <- as.numeric(as.factor(lc_data$home_ownership))
# Merge the individual and joint versions of annual income and DTI: use the
# joint figure when the application is joint (joint column non-NA), otherwise
# the individual one, then drop the four source columns.
lc_data <- lc_data %>%
  mutate(
    annual_inc_merged = ifelse(is.na(annual_inc_joint), annual_inc, annual_inc_joint),
    dti_merged = ifelse(is.na(dti_joint), dti, dti_joint)
  ) %>%
  select(-annual_inc, -annual_inc_joint, -dti, -dti_joint)
Upon reviewing the summary again, it becomes apparent that there are merely 460 joint applications, constituting a small subset within the extensive dataset of around 800k rows. Through consolidating the debt-to-income ratios (dti’s), we can pinpoint the data pertinent to our research objectives. Hence, it is advisable to eliminate the columns verification_status_joint and application_type to prevent introducing unwarranted variability into our analysis.
table(lc_data$verification_status)
Not Verified Source Verified Verified
240255 296631 261553
table(lc_data$verification_status_joint)
Not Verified Source Verified Verified
797979 253 53 154
lc_data$verification_status <- as.numeric(as.factor(lc_data$verification_status))
lc_data <- lc_data %>% select(-verification_status_joint, -application_type)
Let’s check whether “other” is an NA placeholder or a real value for purpose. It’s a real category, so we don’t have to handle it.
lc_data$purpose <- as.factor(lc_data$purpose)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=purpose)) + geom_boxplot()
lc_data$purpose <- as.numeric(lc_data$purpose)
Let’s have a glance to the state address:
table(lc_data$addr_state)
AK AL AR AZ CA CO CT DC DE FL GA
1992 10101 5953 18359 116578 16934 12154 2188 2268 54819 26146
HI IA ID IL IN KS KY LA MA MD ME
4112 13 11 31880 12393 7105 7726 9498 18546 18906 469
MI MN MO MS MT NC ND NE NH NJ NM
20678 14306 12821 3455 2286 22135 431 1064 3865 29991 4428
NV NY OH OK OR PA RI SC SD TN TX
11155 66790 26682 7266 9806 28221 3499 9609 1615 11618 63982
UT VA VT WA WI WV WY
5629 23616 1606 17470 10446 3977 1841
lc_data$addr_state <- as.factor(lc_data$addr_state)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=addr_state)) + geom_boxplot()
lc_data$addr_state <- as.numeric(lc_data$addr_state)
Regarding delinquency in the last 2 years, there are only a few NAs, so we remove those rows:
# Drop the handful of rows with missing delinq_2yrs.
lc_data <- lc_data %>%
  filter(!is.na(delinq_2yrs))
# Bucket months-since-last-delinquency into yearly bands; NA is mapped to
# "NONE" (no delinquency on record).
lc_data <- lc_data %>%
  mutate(mths_since_delinq_cat = case_when(
    is.na(mths_since_last_delinq) ~ "NONE",
    mths_since_last_delinq <= 12 ~ "Less_1_Y",
    mths_since_last_delinq <= 24 ~ "Less_2_Y",
    mths_since_last_delinq <= 36 ~ "Less_3_Y",
    mths_since_last_delinq <= 48 ~ "Less_4_Y",
    TRUE ~ "More_4_Y"
  )) %>%
  select(-mths_since_last_delinq)
# Factor for plotting, then numeric codes for XGBoost compatibility.
lc_data$mths_since_delinq_cat <- as.factor(lc_data$mths_since_delinq_cat)
ggplot(data = lc_data, mapping = aes(x = int_rate, y = mths_since_delinq_cat)) +
  geom_boxplot()
lc_data$mths_since_delinq_cat <- as.numeric(lc_data$mths_since_delinq_cat)
# Bucket months-since-last-public-record into yearly bands; NA is mapped to
# "NONE" (no public record on file).
lc_data <- lc_data %>%
  mutate(mths_since_last_record_cat = case_when(
    is.na(mths_since_last_record) ~ "NONE",
    mths_since_last_record <= 12 ~ "Less_1_Y",
    mths_since_last_record <= 24 ~ "Less_2_Y",
    mths_since_last_record <= 36 ~ "Less_3_Y",
    mths_since_last_record <= 48 ~ "Less_4_Y",
    TRUE ~ "More_4_Y"
  )) %>%
  select(-mths_since_last_record)
# Factor for plotting, then numeric codes for XGBoost compatibility.
lc_data$mths_since_last_record_cat <- as.factor(lc_data$mths_since_last_record_cat)
ggplot(data = lc_data, mapping = aes(x = int_rate, y = mths_since_last_record_cat)) +
  geom_boxplot()
lc_data$mths_since_last_record_cat <- as.numeric(lc_data$mths_since_last_record_cat)
# Bucket months-since-last-major-derogatory into yearly bands; NA is mapped
# to "NONE" (no major derogatory event on record).
lc_data <- lc_data %>%
  mutate(mths_since_last_major_derog_cat = case_when(
    is.na(mths_since_last_major_derog) ~ "NONE",
    mths_since_last_major_derog <= 12 ~ "Less_1_Y",
    mths_since_last_major_derog <= 24 ~ "Less_2_Y",
    mths_since_last_major_derog <= 36 ~ "Less_3_Y",
    mths_since_last_major_derog <= 48 ~ "Less_4_Y",
    TRUE ~ "More_4_Y"
  )) %>%
  select(-mths_since_last_major_derog)
# Factor for plotting, then numeric codes for XGBoost compatibility.
lc_data$mths_since_last_major_derog_cat <- as.factor(lc_data$mths_since_last_major_derog_cat)
ggplot(data = lc_data, mapping = aes(x = int_rate, y = mths_since_last_major_derog_cat)) +
  geom_boxplot()
lc_data$mths_since_last_major_derog_cat <- as.numeric(lc_data$mths_since_last_major_derog_cat)
lc_data$initial_list_status <- as.factor(lc_data$initial_list_status)
ggplot(data = lc_data, mapping = aes(x=int_rate,y=initial_list_status))+geom_boxplot()
lc_data$initial_list_status <- as.numeric(lc_data$initial_list_status)
Let’s check which columns still have null values
colSums(is.na(lc_data))
loan_amnt term
0 0
int_rate emp_length
0 0
home_ownership verification_status
0 0
purpose addr_state
0 0
delinq_2yrs earliest_cr_line
0 0
inq_last_6mths open_acc
0 0
pub_rec revol_bal
0 2
revol_util total_acc
428 0
initial_list_status collections_12_mths_ex_med
0 99
acc_now_delinq tot_coll_amt
0 63132
tot_cur_bal open_acc_6m
63132 779302
open_il_6m open_il_12m
779302 779302
open_il_24m mths_since_rcnt_il
779302 779807
total_bal_il il_util
779302 781784
open_rv_12m open_rv_24m
779302 779302
max_bal_bc all_util
779302 779302
total_rev_hi_lim inq_fi
63132 779302
total_cu_tl inq_last_12m
779302 779302
annual_inc_merged dti_merged
0 0
mths_since_delinq_cat mths_since_last_record_cat
0 0
mths_since_last_major_derog_cat
0
The columns revol_bal and revol_util contain only a few NA values; since those values can’t be meaningfully replaced with 0, we keep only the rows where they are not NA.
lc_data <- lc_data %>%
filter(!(is.na(revol_bal))) %>%
filter(!(is.na(revol_util)))
Let’s check which columns still have null values
names(which(colSums(is.na(lc_data)) > 0))
[1] "collections_12_mths_ex_med" "tot_coll_amt"
[3] "tot_cur_bal" "open_acc_6m"
[5] "open_il_6m" "open_il_12m"
[7] "open_il_24m" "mths_since_rcnt_il"
[9] "total_bal_il" "il_util"
[11] "open_rv_12m" "open_rv_24m"
[13] "max_bal_bc" "all_util"
[15] "total_rev_hi_lim" "inq_fi"
[17] "total_cu_tl" "inq_last_12m"
Replace NA values with 0 in the columns where that is a meaningful default
# Columns where a missing value plausibly means "none reported":
# recode NA -> 0 in a single pass instead of 18 separate mutate() calls.
na_to_zero_cols <- c(
  "open_acc_6m", "tot_cur_bal", "open_il_6m", "open_il_12m", "open_il_24m",
  "mths_since_rcnt_il", "total_bal_il", "il_util", "open_rv_12m",
  "total_rev_hi_lim", "max_bal_bc", "all_util", "inq_fi", "total_cu_tl",
  "inq_last_12m", "open_rv_24m", "tot_coll_amt", "collections_12_mths_ex_med"
)
lc_data <- lc_data %>%
  mutate(across(all_of(na_to_zero_cols), ~ ifelse(is.na(.x), 0, .x)))
earliest_cr_line contains the month in which the borrower’s earliest reported credit line was opened. Even though this date consists only of a month and a year, there are still too many unique values to treat it as categorical. We can instead transform the dates into a numerical value by converting them to Unix time, which measures time as the number of seconds elapsed since 00:00:00 UTC on 1 January 1970. Since this column doesn’t contain the day number, we take the first day of the month as a reference.
lc_data <- lc_data %>%
filter(!(is.na(earliest_cr_line)))
# function to replace dates with unix time
# Convert a "Mon-YYYY" month string to Unix time (seconds since
# 00:00:00 UTC on 1 January 1970). The raw data omits the day, so each
# date is anchored to the first of the month. Vectorized over its input.
to_unix_time <- function(date) {
  full_date <- paste("01", date, sep = "-")
  as.numeric(as.POSIXct(full_date, format = "%d-%b-%Y", tz = "UTC"))
}
# Map dates to unix time. to_unix_time() is vectorized (paste/as.POSIXct
# operate on whole vectors), so call it on the column directly: the original
# row-wise apply() coerced the entire data frame to a character matrix on
# every call and was dramatically slower.
lc_data$earliest_cr_line <- to_unix_time(lc_data$earliest_cr_line)
# standardize them
#lc_data$earliest_cr_line <- scale(lc_data$earliest_cr_line)
Outliers Removal.
# Visualize the int_rate distribution and its boxplot outliers.
boxplot(lc_data$int_rate)
# Identify outliers using boxplot (values beyond 1.5 * IQR whiskers)
outliers <- boxplot(lc_data$int_rate, plot = FALSE)$out
# Remove outliers from the dataset
# NOTE(review): the filtered result is stored in lc_data_clean, but every
# later step (summary, train/test split, models) keeps using lc_data — so
# this outlier removal currently has NO effect. Confirm whether lc_data
# should be reassigned here.
lc_data_clean <- lc_data[!lc_data$int_rate %in% outliers, ]
summary(lc_data)
loan_amnt term int_rate emp_length
Min. : 500 Min. :1.0 Min. : 5.32 Min. : 1.00
1st Qu.: 8000 1st Qu.:1.0 1st Qu.: 9.99 1st Qu.: 3.00
Median :13000 Median :1.0 Median :12.99 Median : 4.00
Mean :14757 Mean :1.3 Mean :13.24 Mean : 5.11
3rd Qu.:20000 3rd Qu.:2.0 3rd Qu.:16.20 3rd Qu.: 7.00
Max. :35000 Max. :2.0 Max. :28.99 Max. :12.00
home_ownership verification_status purpose addr_state
Min. :1.000 Min. :1.000 Min. : 1.000 Min. : 1.00
1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 3.000 1st Qu.:10.00
Median :2.000 Median :2.000 Median : 3.000 Median :24.00
Mean :1.901 Mean :2.027 Mean : 3.571 Mean :24.14
3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 3.000 3rd Qu.:37.00
Max. :3.000 Max. :3.000 Max. :14.000 Max. :51.00
delinq_2yrs earliest_cr_line inq_last_6mths open_acc
Min. : 0.0000 Min. :-820540800 Min. : 0.0000 Min. : 1.00
1st Qu.: 0.0000 1st Qu.: 770428800 1st Qu.: 0.0000 1st Qu.: 8.00
Median : 0.0000 Median : 936144000 Median : 0.0000 Median :11.00
Mean : 0.3143 Mean : 889273164 Mean : 0.6947 Mean :11.55
3rd Qu.: 0.0000 3rd Qu.:1051747200 3rd Qu.: 1.0000 3rd Qu.:14.00
Max. :39.0000 Max. :1351728000 Max. :33.0000 Max. :90.00
pub_rec revol_bal revol_util total_acc
Min. : 0.0000 Min. : 0 Min. : 0.00 Min. : 1.00
1st Qu.: 0.0000 1st Qu.: 6450 1st Qu.: 37.70 1st Qu.: 17.00
Median : 0.0000 Median : 11881 Median : 56.00 Median : 24.00
Mean : 0.1954 Mean : 16934 Mean : 55.05 Mean : 25.27
3rd Qu.: 0.0000 3rd Qu.: 20844 3rd Qu.: 73.50 3rd Qu.: 32.00
Max. :63.0000 Max. :2904836 Max. :892.30 Max. :169.00
initial_list_status collections_12_mths_ex_med acc_now_delinq
Min. :1.000 Min. : 0.00000 Min. : 0.000000
1st Qu.:1.000 1st Qu.: 0.00000 1st Qu.: 0.000000
Median :1.000 Median : 0.00000 Median : 0.000000
Mean :1.485 Mean : 0.01448 Mean : 0.005026
3rd Qu.:2.000 3rd Qu.: 0.00000 3rd Qu.: 0.000000
Max. :2.000 Max. :20.00000 Max. :14.000000
tot_coll_amt tot_cur_bal open_acc_6m open_il_6m
Min. : 0 Min. : 0 Min. : 0.00000 Min. : 0.00000
1st Qu.: 0 1st Qu.: 23195 1st Qu.: 0.00000 1st Qu.: 0.00000
Median : 0 Median : 65402 Median : 0.00000 Median : 0.00000
Mean : 210 Mean : 128461 Mean : 0.02641 Mean : 0.06982
3rd Qu.: 0 3rd Qu.: 195864 3rd Qu.: 0.00000 3rd Qu.: 0.00000
Max. :9152545 Max. :8000078 Max. :14.00000 Max. :33.00000
open_il_12m open_il_24m mths_since_rcnt_il total_bal_il
Min. : 0.00000 Min. : 0.00000 Min. : 0.0000 Min. : 0
1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.: 0.0000 1st Qu.: 0
Median : 0.00000 Median : 0.00000 Median : 0.0000 Median : 0
Mean : 0.01816 Mean : 0.03991 Mean : 0.4918 Mean : 872
3rd Qu.: 0.00000 3rd Qu.: 0.00000 3rd Qu.: 0.0000 3rd Qu.: 0
Max. :12.00000 Max. :19.00000 Max. :363.0000 Max. :878459
il_util open_rv_12m open_rv_24m max_bal_bc
Min. : 0.000 Min. : 0.00000 Min. : 0.00000 Min. : 0.0
1st Qu.: 0.000 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.: 0.0
Median : 0.000 Median : 0.00000 Median : 0.00000 Median : 0.0
Mean : 1.489 Mean : 0.03316 Mean : 0.07114 Mean : 140.8
3rd Qu.: 0.000 3rd Qu.: 0.00000 3rd Qu.: 0.00000 3rd Qu.: 0.0
Max. :223.300 Max. :22.00000 Max. :43.00000 Max. :83047.0
all_util total_rev_hi_lim inq_fi total_cu_tl
Min. : 0.000 Min. : 0 Min. : 0.00000 Min. : 0.00000
1st Qu.: 0.000 1st Qu.: 11700 1st Qu.: 0.00000 1st Qu.: 0.00000
Median : 0.000 Median : 21800 Median : 0.00000 Median : 0.00000
Mean : 1.456 Mean : 29564 Mean : 0.02262 Mean : 0.03668
3rd Qu.: 0.000 3rd Qu.: 37900 3rd Qu.: 0.00000 3rd Qu.: 0.00000
Max. :151.400 Max. :9999999 Max. :16.00000 Max. :35.00000
inq_last_12m annual_inc_merged dti_merged mths_since_delinq_cat
Min. :-4.00000 Min. : 1896 Min. : 0.00 Min. :1.000
1st Qu.: 0.00000 1st Qu.: 45000 1st Qu.:11.91 1st Qu.:3.000
Median : 0.00000 Median : 65000 Median :17.66 Median :6.000
Mean : 0.04733 Mean : 75038 Mean :18.13 Mean :4.576
3rd Qu.: 0.00000 3rd Qu.: 90000 3rd Qu.:23.94 3rd Qu.:6.000
Max. :32.00000 Max. :9500000 Max. :43.86 Max. :6.000
mths_since_last_record_cat mths_since_last_major_derog_cat
Min. :1.000 Min. :1.000
1st Qu.:6.000 1st Qu.:6.000
Median :6.000 Median :6.000
Mean :5.779 Mean :5.435
3rd Qu.:6.000 3rd Qu.:6.000
Max. :6.000 Max. :6.000
# TODO: (old part) 80/20 split and linear regression...
# Create indices for splitting (80% train, 20% test); createDataPartition
# stratifies the split on the outcome variable.
train_indices <- createDataPartition(lc_data$int_rate, p = 0.8, list = FALSE)
# Create training and testing datasets
train_data <- lc_data[train_indices, ]
test_data <- lc_data[-train_indices, ]
#### Linear Regression ####
#lm.fit <- lm(int_rate ~ ., data = train_data)
# TODO: check collinearity and multicollinearity
#vif(lm.fit) # there is multicollinearity
#cor(lc_data)
# Make predictions on training and testing data
#train_predictions <- predict(lm.fit, newdata = train_data)
#test_predictions <- predict(lm.fit, newdata = test_data)
# Evaluate model performance on training data
#train_rmse <- sqrt(mean((train_predictions - train_data$int_rate)^2))
#train_r_squared <- summary(lm.fit)$r.squared
# Evaluate model performance on testing data
#test_rmse <- sqrt(mean((test_predictions - test_data$int_rate)^2))
#test_r_squared <- summary(lm.fit, test_data)$r.squared
# Print evaluation metrics
#cat("Training RMSE:", train_rmse, "\n")
#cat("Training R-squared:", train_r_squared, "\n")
#rmse <- sqrt(mean(lm.fit$residuals^2))
#print(rmse)
#### Linear Regression ####
# Fit an OLS model of interest rate on all remaining predictors.
lm.fit <- lm(int_rate ~ ., data = train_data)
# Make predictions on the training and testing data
lm.train_predictions <- predict(lm.fit, newdata = train_data)
lm.test_predictions <- predict(lm.fit, newdata = test_data)
# Prediction errors, shared by every metric below
lm.train_errors <- lm.train_predictions - train_data$int_rate
lm.test_errors <- lm.test_predictions - test_data$int_rate
# Mean Squared Error (MSE)
lm.train_mse <- mean(lm.train_errors^2)
lm.test_mse <- mean(lm.test_errors^2)
# Root Mean Squared Error (RMSE)
lm.train_rmse <- sqrt(lm.train_mse)
lm.test_rmse <- sqrt(lm.test_mse)
# Mean Absolute Error (MAE)
lm.train_mae <- mean(abs(lm.train_errors))
lm.test_mae <- mean(abs(lm.test_errors))
# R-squared: 1 - SSE / SST
lm.train_r2 <- 1 - sum(lm.train_errors^2) /
  sum((train_data$int_rate - mean(train_data$int_rate))^2)
lm.test_r2 <- 1 - sum(lm.test_errors^2) /
  sum((test_data$int_rate - mean(test_data$int_rate))^2)
# Display the metrics
cat("Training MSE:", lm.train_mse, "\n")
Training MSE: 10.69433
cat("Testing MSE:", lm.test_mse, "\n")
Testing MSE: 11.08025
cat("Training RMSE:", lm.train_rmse, "\n")
Training RMSE: 3.270218
cat("Testing RMSE:", lm.test_rmse, "\n")
Testing RMSE: 3.328701
cat("Training MAE:", lm.train_mae, "\n")
Training MAE: 2.591549
cat("Testing MAE:", lm.test_mae, "\n")
Testing MAE: 2.592027
cat("Training R-squared (R²):", lm.train_r2, "\n")
Training R-squared (R²): 0.4431432
cat("Testing R-squared (R²):", lm.test_r2, "\n")
Testing R-squared (R²): 0.4227827
Lasso — note that glmnet standardizes the data automatically.
# Design matrices for glmnet: model.matrix() expands factors into dummy
# columns; the leading intercept column is dropped ([, -1]) because glmnet
# fits its own intercept.
lasso.predictors_train <- model.matrix(int_rate ~ ., train_data)[,-1]
lasso.target_train <- train_data$int_rate
lasso.predictors_test <- model.matrix(int_rate ~ ., test_data)[,-1]
lasso.target_test <- test_data$int_rate
# Fit the full lasso path (alpha = 1); glmnet standardizes predictors
# internally before fitting.
lasso.fit <- glmnet(lasso.predictors_train, lasso.target_train, alpha = 1)
dim(coef(lasso.fit))
[1] 41 70
# We have only 69 rows, because glmnet has a stop criterion, see help.
lasso.fit
Call: glmnet(x = lasso.predictors_train, y = lasso.target_train, alpha = 1)
Df %Dev Lambda
1 0 0.00 1.87700
2 1 3.11 1.71000
3 1 5.70 1.55800
4 1 7.85 1.42000
5 1 9.63 1.29400
6 1 11.11 1.17900
7 2 12.73 1.07400
8 3 15.01 0.97850
9 4 17.72 0.89160
10 4 20.19 0.81240
11 4 22.24 0.74020
12 6 24.54 0.67450
13 6 26.76 0.61450
14 7 28.69 0.55990
15 7 30.41 0.51020
16 9 32.15 0.46490
17 10 33.70 0.42360
18 10 35.04 0.38600
19 12 36.20 0.35170
20 13 37.34 0.32040
21 13 38.30 0.29200
22 14 39.11 0.26600
23 15 39.85 0.24240
24 15 40.47 0.22090
25 15 40.99 0.20120
26 16 41.42 0.18340
27 17 41.81 0.16710
28 17 42.13 0.15220
29 17 42.40 0.13870
30 17 42.62 0.12640
31 17 42.80 0.11520
32 19 42.97 0.10490
33 20 43.14 0.09560
34 20 43.28 0.08711
35 20 43.40 0.07937
36 21 43.52 0.07232
37 22 43.63 0.06590
38 23 43.72 0.06004
39 25 43.80 0.05471
40 27 43.88 0.04985
41 28 43.94 0.04542
42 31 44.00 0.04138
43 31 44.05 0.03771
44 31 44.09 0.03436
45 31 44.13 0.03131
46 31 44.16 0.02852
47 32 44.18 0.02599
48 33 44.21 0.02368
49 33 44.22 0.02158
50 33 44.24 0.01966
51 33 44.25 0.01791
52 33 44.26 0.01632
53 33 44.27 0.01487
54 35 44.28 0.01355
55 35 44.28 0.01235
56 35 44.29 0.01125
57 35 44.29 0.01025
58 36 44.30 0.00934
59 36 44.30 0.00851
60 36 44.30 0.00775
61 36 44.30 0.00707
62 36 44.30 0.00644
63 37 44.31 0.00587
64 37 44.31 0.00534
65 37 44.31 0.00487
66 37 44.31 0.00444
67 37 44.31 0.00404
68 37 44.31 0.00368
69 37 44.31 0.00336
70 37 44.31 0.00306
# We observe that some of the coefficients are set to zero (Df=0, %Dev=0)
plot(lasso.fit, label=TRUE)
# Also here we can see that some of the coefficients are set to zero.
# Choose the regularization strength by cross-validation on the training set.
lasso.cv <- cv.glmnet(lasso.predictors_train, lasso.target_train, alpha = 1)
# Make predictions on the training and testing data.
# NOTE: predict.glmnet takes the design matrix via `newx` (a `newdata`
# argument is silently ignored), and without `s` it returns one column per
# lambda on the path. The original code also passed the TRAIN matrix when
# predicting for the test set — together these produced the recycling
# warnings and the nonsense (negative) R² printed below.
lasso.train_predictions <- predict(lasso.fit, newx = lasso.predictors_train,
                                   s = lasso.cv$lambda.min)
lasso.test_predictions <- predict(lasso.fit, newx = lasso.predictors_test,
                                  s = lasso.cv$lambda.min)
# Calculate Mean Squared Error (MSE) for training and testing
lasso.train_mse <- mean((lasso.train_predictions - train_data$int_rate)^2)
lasso.test_mse <- mean((lasso.test_predictions - test_data$int_rate)^2)
Warning: longer object length is not a multiple of shorter object length
# Calculate Root Mean Squared Error (RMSE) for training and testing
lasso.train_rmse <- sqrt(lasso.train_mse)
lasso.test_rmse <- sqrt(lasso.test_mse)
# Calculate Mean Absolute Error (MAE) for training and testing
lasso.train_mae <- mean(abs(lasso.train_predictions - train_data$int_rate))
lasso.test_mae <- mean(abs(lasso.test_predictions - test_data$int_rate))
Warning: longer object length is not a multiple of shorter object length
# Calculate R-squared (R²) for training and testing
lasso.train_r2 <- 1 - (sum((train_data$int_rate - lasso.train_predictions)^2) / sum((train_data$int_rate - mean(train_data$int_rate))^2))
lasso.test_r2 <- 1 - (sum((test_data$int_rate - lasso.test_predictions)^2) / sum((test_data$int_rate - mean(test_data$int_rate))^2))
Warning: longer object length is not a multiple of shorter object length
# Display the metrics
cat("Training MSE:", lasso.train_mse, "\n")
Training MSE: 12.12513
cat("Testing MSE:", lasso.test_mse, "\n")
Testing MSE: 24.93444
cat("Training RMSE:", lasso.train_rmse, "\n")
Training RMSE: 3.482116
cat("Testing RMSE:", lasso.test_rmse, "\n")
Testing RMSE: 4.993439
cat("Training MAE:", lasso.train_mae, "\n")
Training MAE: 2.767999
cat("Testing MAE:", lasso.test_mae, "\n")
Testing MAE: 3.995769
cat("Training R-squared (R²):", lasso.train_r2, "\n")
Training R-squared (R²): -43.19514
cat("Testing R-squared (R²):", lasso.test_r2, "\n")
Testing R-squared (R²): -362.708
K fold using K=5:
# Define the number of folds for cross-validation
num_folds <- 5
# createFolds() returns, for each fold, the row indices that belong to that
# fold — i.e. the indices to HOLD OUT when that fold is the validation set.
folds <- createFolds(train_data$int_rate, k = num_folds, list = TRUE)
K fold using K=5 and linear regression:
#### Linear Regression applying Cross Validation with k=5 ####
# Containers for the per-fold models and their evaluation metrics
lm.k5.models <- list()
lm.k5.results <- data.frame()
# Perform k-fold cross-validation. createFolds() returns the indices held
# out in each fold, so fold i is the VALIDATION set and the remaining k-1
# folds form the training set. (The original loop had this inverted: it
# trained on ~1/k of the rows and validated on the other ~(k-1)/k, and it
# also clobbered the global train_indices/test_indices variables.)
for (i in seq_along(folds)) {
  # Split the data into training and validation sets for the current fold
  fold_test_idx <- folds[[i]]
  fold_train_idx <- setdiff(seq_len(nrow(train_data)), fold_test_idx)
  train_data_fold <- train_data[fold_train_idx, ]
  test_data_fold <- train_data[fold_test_idx, ]
  # Fit the model on the k-1 training folds
  lm.k5 <- lm(int_rate ~ ., data = train_data_fold)
  lm.k5.models[[i]] <- lm.k5 # Store the model
  # Make predictions on the training and held-out data of this fold
  lm.k5.train_predictions <- predict(lm.k5, newdata = train_data_fold)
  lm.k5.test_predictions <- predict(lm.k5, newdata = test_data_fold)
  # Calculate metrics for the training folds
  lm.k5.train_mse <- mean((lm.k5.train_predictions - train_data_fold$int_rate)^2)
  lm.k5.train_rmse <- sqrt(lm.k5.train_mse)
  lm.k5.train_mae <- mean(abs(lm.k5.train_predictions - train_data_fold$int_rate))
  lm.k5.train_r2 <- summary(lm.k5)$r.squared
  # Calculate metrics for the held-out fold
  lm.k5.test_mse <- mean((lm.k5.test_predictions - test_data_fold$int_rate)^2)
  lm.k5.test_rmse <- sqrt(lm.k5.test_mse)
  lm.k5.test_mae <- mean(abs(lm.k5.test_predictions - test_data_fold$int_rate))
  lm.k5.test_r2 <- 1 - (sum((test_data_fold$int_rate - lm.k5.test_predictions)^2) /
                          sum((test_data_fold$int_rate - mean(test_data_fold$int_rate))^2))
  # Append this fold's metrics to the results dataframe
  lm.k5.results <- rbind(lm.k5.results, data.frame(
    Fold = i,
    Train_MSE = lm.k5.train_mse, Test_MSE = lm.k5.test_mse,
    Train_RMSE = lm.k5.train_rmse, Test_RMSE = lm.k5.test_rmse,
    Train_MAE = lm.k5.train_mae, Test_MAE = lm.k5.test_mae,
    Train_R2 = lm.k5.train_r2, Test_R2 = lm.k5.test_r2
  ))
}
# Display the models and their metrics
print(lm.k5.models)
[[1]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt
3.990e+00 3.224e-05
term emp_length
3.921e+00 1.270e-02
home_ownership verification_status
2.519e-01 7.589e-01
purpose addr_state
3.385e-01 4.820e-05
delinq_2yrs earliest_cr_line
3.545e-02 1.869e-09
inq_last_6mths open_acc
9.654e-01 6.661e-02
pub_rec revol_bal
3.458e-01 3.940e-06
revol_util total_acc
4.253e-02 -3.598e-02
initial_list_status collections_12_mths_ex_med
-1.015e+00 2.179e-01
acc_now_delinq tot_coll_amt
1.025e+00 2.255e-05
tot_cur_bal open_acc_6m
-1.205e-06 3.512e-04
open_il_6m open_il_12m
-1.427e-01 6.414e-01
open_il_24m mths_since_rcnt_il
-5.466e-02 -7.403e-03
total_bal_il il_util
4.214e-07 -1.237e-03
open_rv_12m open_rv_24m
1.658e-01 6.001e-02
max_bal_bc all_util
-4.794e-05 -5.778e-03
total_rev_hi_lim inq_fi
-1.649e-05 5.575e-02
total_cu_tl inq_last_12m
-4.534e-02 5.826e-02
annual_inc_merged dti_merged
-3.387e-06 5.134e-02
mths_since_delinq_cat mths_since_last_record_cat
-2.020e-01 -1.925e-01
mths_since_last_major_derog_cat
-1.434e-01
[[2]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt
3.686e+00 2.412e-05
term emp_length
3.944e+00 1.660e-02
home_ownership verification_status
2.446e-01 7.453e-01
purpose addr_state
3.437e-01 -4.542e-04
delinq_2yrs earliest_cr_line
3.112e-02 1.998e-09
inq_last_6mths open_acc
9.976e-01 4.858e-02
pub_rec revol_bal
3.926e-01 -6.629e-06
revol_util total_acc
4.792e-02 -3.290e-02
initial_list_status collections_12_mths_ex_med
-1.072e+00 3.118e-01
acc_now_delinq tot_coll_amt
1.259e+00 2.134e-05
tot_cur_bal open_acc_6m
-1.514e-06 1.265e-01
open_il_6m open_il_12m
-1.423e-01 6.040e-01
open_il_24m mths_since_rcnt_il
-4.183e-02 -6.549e-03
total_bal_il il_util
2.070e-06 -5.289e-03
open_rv_12m open_rv_24m
1.256e-01 5.293e-02
max_bal_bc all_util
-8.822e-05 1.001e-03
total_rev_hi_lim inq_fi
-3.157e-06 9.824e-02
total_cu_tl inq_last_12m
-8.761e-02 5.175e-02
annual_inc_merged dti_merged
-3.618e-06 5.119e-02
mths_since_delinq_cat mths_since_last_record_cat
-2.071e-01 -1.762e-01
mths_since_last_major_derog_cat
-1.537e-01
[[3]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt
3.844e+00 3.608e-05
term emp_length
3.867e+00 1.478e-02
home_ownership verification_status
2.625e-01 7.419e-01
purpose addr_state
3.363e-01 6.072e-04
delinq_2yrs earliest_cr_line
4.205e-02 1.888e-09
inq_last_6mths open_acc
9.894e-01 6.404e-02
pub_rec revol_bal
4.010e-01 5.549e-06
revol_util total_acc
4.195e-02 -3.446e-02
initial_list_status collections_12_mths_ex_med
-1.025e+00 3.267e-01
acc_now_delinq tot_coll_amt
1.465e+00 3.632e-05
tot_cur_bal open_acc_6m
-1.007e-06 -3.460e-03
open_il_6m open_il_12m
-1.426e-01 6.240e-01
open_il_24m mths_since_rcnt_il
-6.981e-02 -9.031e-03
total_bal_il il_util
1.895e-06 -5.360e-03
open_rv_12m open_rv_24m
2.909e-01 -2.050e-02
max_bal_bc all_util
-5.555e-05 -3.321e-04
total_rev_hi_lim inq_fi
-1.705e-05 -2.927e-03
total_cu_tl inq_last_12m
-3.094e-02 7.681e-02
annual_inc_merged dti_merged
-4.850e-06 5.052e-02
mths_since_delinq_cat mths_since_last_record_cat
-1.951e-01 -1.516e-01
mths_since_last_major_derog_cat
-1.509e-01
[[4]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt
3.731e+00 2.577e-05
term emp_length
3.911e+00 1.302e-02
home_ownership verification_status
2.586e-01 7.454e-01
purpose addr_state
3.438e-01 4.575e-05
delinq_2yrs earliest_cr_line
4.651e-02 1.988e-09
inq_last_6mths open_acc
1.000e+00 4.523e-02
pub_rec revol_bal
4.347e-01 -3.958e-06
revol_util total_acc
4.736e-02 -3.143e-02
initial_list_status collections_12_mths_ex_med
-1.091e+00 3.666e-01
acc_now_delinq tot_coll_amt
1.285e+00 3.192e-05
tot_cur_bal open_acc_6m
-1.527e-06 -7.028e-02
open_il_6m open_il_12m
-1.060e-01 7.431e-01
open_il_24m mths_since_rcnt_il
-8.725e-02 -1.108e-02
total_bal_il il_util
-2.061e-07 -5.789e-03
open_rv_12m open_rv_24m
2.228e-01 5.915e-02
max_bal_bc all_util
-7.615e-05 2.525e-03
total_rev_hi_lim inq_fi
-3.420e-06 6.850e-02
total_cu_tl inq_last_12m
-8.391e-02 4.464e-02
annual_inc_merged dti_merged
-4.028e-06 5.019e-02
mths_since_delinq_cat mths_since_last_record_cat
-2.029e-01 -1.855e-01
mths_since_last_major_derog_cat
-1.429e-01
[[5]]
Call:
lm(formula = int_rate ~ ., data = train_data_fold)
Coefficients:
(Intercept) loan_amnt
3.673e+00 3.206e-05
term emp_length
3.892e+00 1.744e-02
home_ownership verification_status
2.754e-01 7.294e-01
purpose addr_state
3.328e-01 6.610e-04
delinq_2yrs earliest_cr_line
2.837e-02 1.893e-09
inq_last_6mths open_acc
9.921e-01 6.424e-02
pub_rec revol_bal
4.228e-01 7.145e-06
revol_util total_acc
4.255e-02 -3.639e-02
initial_list_status collections_12_mths_ex_med
-9.944e-01 2.516e-01
acc_now_delinq tot_coll_amt
1.367e+00 2.477e-05
tot_cur_bal open_acc_6m
-1.040e-06 8.671e-02
open_il_6m open_il_12m
-1.448e-01 5.623e-01
open_il_24m mths_since_rcnt_il
-2.528e-02 -4.976e-03
total_bal_il il_util
3.319e-06 -3.528e-03
open_rv_12m open_rv_24m
1.595e-01 6.265e-02
max_bal_bc all_util
-7.606e-05 -2.039e-03
total_rev_hi_lim inq_fi
-1.711e-05 -3.026e-02
total_cu_tl inq_last_12m
-6.547e-02 8.003e-02
annual_inc_merged dti_merged
-3.475e-06 5.239e-02
mths_since_delinq_cat mths_since_last_record_cat
-2.008e-01 -1.553e-01
mths_since_last_major_derog_cat
-1.422e-01
# Display the per-fold cross-validation metrics
print(lm.k5.results)
# Reshape to long format: one row per (Fold, Metric) combination
long_data <- pivot_longer(lm.k5.results,
                          cols = -Fold,        # Exclude Fold from reshaping
                          names_to = "Metric",
                          values_to = "Value")
# Check the reshaped data
print(head(long_data))
# Split into training / testing metric subsets.
# FIX: the original plotted `long_data_training` and `long_data_testing`
# without ever creating them, which fails with "object not found"; it also
# drew the training plot twice. Define the subsets once and plot each once.
long_data_training <- subset(long_data, grepl("Train", Metric))
long_data_testing <- subset(long_data, grepl("Test", Metric))
# Plot for Training Metrics
ggplot(long_data_training, aes(x = factor(Fold), y = Value, color = Metric)) +
  geom_point() +
  geom_line() + # Connecting points across folds shows fold-to-fold stability
  labs(title = "Training Metrics per Fold - Linear Regression",
       x = "Fold",
       y = "Metric Value") +
  theme_minimal() +
  scale_color_discrete(name = "Training Metrics")
# Plot for Testing Metrics
ggplot(long_data_testing, aes(x = factor(Fold), y = Value, color = Metric)) +
  geom_point() +
  geom_line() +
  labs(title = "Testing Metrics per Fold - Linear Regression",
       x = "Fold",
       y = "Metric Value") +
  theme_minimal() +
  scale_color_discrete(name = "Testing Metrics")
K fold using K=5 and Random Forest:
#### Random Forest applying Cross Validation with k5 ####
# Initialize lists to store models and their results
rf.k5.models <- list()
rf.k5.results <- data.frame()
# Perform k-fold cross-validation
for (i in seq_along(folds)) {
  # Split the data into training and testing for the current fold
  train_indices <- folds[[i]]
  test_indices <- setdiff(seq_len(nrow(train_data)), train_indices)
  train_data_fold <- train_data[train_indices, ]
  test_data_fold <- train_data[test_indices, ]
  # Fit the model on the training fold ONLY.
  # FIX: the original fitted on the full train_data, which leaks the held-out
  # fold into training (test metrics would not be honest CV estimates) and
  # multiplies memory use — the likely cause of the
  # "vector memory exhausted" error observed during this loop.
  rf.k5 <- ranger(formula = int_rate ~ ., data = train_data_fold,
                  num.trees = 500, verbose = TRUE,
                  importance = "impurity", oob.error = TRUE)
  rf.k5.models[[i]] <- rf.k5 # Store the model
  # FIX: ranger's predict() returns a prediction object; the numeric vector
  # lives in its $predictions component (the original did arithmetic on the
  # whole object, which would error).
  rf.k5.train_predictions <- predict(rf.k5, data = train_data_fold)$predictions
  rf.k5.test_predictions <- predict(rf.k5, data = test_data_fold)$predictions
  # Calculate metrics for training fold
  rf.k5.train_mse <- mean((rf.k5.train_predictions - train_data_fold$int_rate)^2)
  rf.k5.train_rmse <- sqrt(rf.k5.train_mse)
  rf.k5.train_mae <- mean(abs(rf.k5.train_predictions - train_data_fold$int_rate))
  # FIX: ranger objects have no summary()$r.squared; compute in-sample R^2
  # directly (rf.k5$r.squared would be the OOB R^2, a different quantity).
  rf.k5.train_r2 <- 1 - (sum((train_data_fold$int_rate - rf.k5.train_predictions)^2) /
                           sum((train_data_fold$int_rate - mean(train_data_fold$int_rate))^2))
  # Calculate metrics for testing fold
  rf.k5.test_mse <- mean((rf.k5.test_predictions - test_data_fold$int_rate)^2)
  rf.k5.test_rmse <- sqrt(rf.k5.test_mse)
  rf.k5.test_mae <- mean(abs(rf.k5.test_predictions - test_data_fold$int_rate))
  rf.k5.test_r2 <- 1 - (sum((test_data_fold$int_rate - rf.k5.test_predictions)^2) /
                          sum((test_data_fold$int_rate - mean(test_data_fold$int_rate))^2))
  # Store metrics in the results dataframe
  rf.k5.results <- rbind(rf.k5.results, data.frame(
    Fold = i,
    Train_MSE = rf.k5.train_mse, Test_MSE = rf.k5.test_mse,
    Train_RMSE = rf.k5.train_rmse, Test_RMSE = rf.k5.test_rmse,
    Train_MAE = rf.k5.train_mae, Test_MAE = rf.k5.test_mae,
    Train_R2 = rf.k5.train_r2, Test_R2 = rf.k5.test_r2
  ))
}
Growing trees.. Progress: 12%. Estimated remaining time: 3 minutes, 47 seconds.
Growing trees.. Progress: 25%. Estimated remaining time: 3 minutes, 8 seconds.
Growing trees.. Progress: 37%. Estimated remaining time: 2 minutes, 40 seconds.
Growing trees.. Progress: 50%. Estimated remaining time: 2 minutes, 5 seconds.
Growing trees.. Progress: 63%. Estimated remaining time: 1 minute, 31 seconds.
Growing trees.. Progress: 77%. Estimated remaining time: 56 seconds.
Growing trees.. Progress: 90%. Estimated remaining time: 24 seconds.
Error: vector memory exhausted (limit reached?)
K fold using K=5 and Boosting:
#### Boosting applying Cross Validation with k=5 ####
# Containers for the per-fold boosted models and their evaluation metrics
xgb.k5.models <- list()
xgb.k5.results <- data.frame()
# 5-fold cross-validation over the pre-computed fold index sets
for (i in seq_along(folds)) {
  # Rows in the current fold are used for fitting; the remainder for testing
  train_indices <- folds[[i]]
  test_indices <- setdiff(seq_len(nrow(train_data)), train_indices)
  train_data_fold <- train_data[train_indices, ]
  test_data_fold <- train_data[test_indices, ]
  # xgboost expects a numeric predictor matrix plus a separate label vector
  xgb.y_train_fold <- train_data_fold$int_rate
  xgb.X_train_fold <- as.matrix(train_data_fold[, names(train_data_fold) != "int_rate"])
  xgb.y_test_fold <- test_data_fold$int_rate
  xgb.X_test_fold <- as.matrix(test_data_fold[, names(test_data_fold) != "int_rate"])
  # Fit a boosted model on the training fold and keep it for later inspection
  xgb.k5 <- xgboost(
    data = xgb.X_train_fold,
    label = xgb.y_train_fold,
    nrounds = 100,
    verbose = 0
  )
  xgb.k5.models[[i]] <- xgb.k5
  # Score both partitions of the fold
  xgb.k5.train_predictions <- predict(xgb.k5, newdata = xgb.X_train_fold)
  xgb.k5.test_predictions <- predict(xgb.k5, newdata = xgb.X_test_fold)
  # Training-fold error metrics
  xgb.k5.train_mse <- mean((xgb.k5.train_predictions - xgb.y_train_fold)^2)
  xgb.k5.train_rmse <- sqrt(xgb.k5.train_mse)
  xgb.k5.train_mae <- mean(abs(xgb.k5.train_predictions - xgb.y_train_fold))
  xgb.k5.train_r2 <- 1 - sum((xgb.y_train_fold - xgb.k5.train_predictions)^2) /
    sum((xgb.y_train_fold - mean(xgb.y_train_fold))^2)
  # Testing-fold error metrics
  xgb.k5.test_mse <- mean((xgb.k5.test_predictions - xgb.y_test_fold)^2)
  xgb.k5.test_rmse <- sqrt(xgb.k5.test_mse)
  xgb.k5.test_mae <- mean(abs(xgb.k5.test_predictions - xgb.y_test_fold))
  xgb.k5.test_r2 <- 1 - sum((xgb.y_test_fold - xgb.k5.test_predictions)^2) /
    sum((xgb.y_test_fold - mean(xgb.y_test_fold))^2)
  # Append this fold's metrics as one row of the results table
  xgb.k5.results <- rbind(xgb.k5.results, data.frame(
    Fold = i,
    Train_MSE = xgb.k5.train_mse, Test_MSE = xgb.k5.test_mse,
    Train_RMSE = xgb.k5.train_rmse, Test_RMSE = xgb.k5.test_rmse,
    Train_MAE = xgb.k5.train_mae, Test_MAE = xgb.k5.test_mae,
    Train_R2 = xgb.k5.train_r2, Test_R2 = xgb.k5.test_r2
  ))
}
# Display the models and their metrics
print(xgb.k5.models)
[[1]]
##### xgb.Booster
raw: 445.5 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 40
niter: 100
nfeatures : 40
evaluation_log:
[[2]]
##### xgb.Booster
raw: 458.1 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 40
niter: 100
nfeatures : 40
evaluation_log:
[[3]]
##### xgb.Booster
raw: 445.7 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 40
niter: 100
nfeatures : 40
evaluation_log:
[[4]]
##### xgb.Booster
raw: 454.4 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 40
niter: 100
nfeatures : 40
evaluation_log:
[[5]]
##### xgb.Booster
raw: 454.7 Kb
call:
xgb.train(params = params, data = dtrain, nrounds = nrounds,
watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
early_stopping_rounds = early_stopping_rounds, maximize = maximize,
save_period = save_period, save_name = save_name, xgb_model = xgb_model,
callbacks = callbacks)
params (as set within xgb.train):
validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 40
niter: 100
nfeatures : 40
evaluation_log:
print(xgb.k5.results)
Decision Tree
#### Decision Trees ####
# Error in tree: "factor predictors must have at most 32 levels" is thrown.
# Basically, it becomes computationally expensive to create so many splits in your data, since you are selecting the best split out of all 2^32 (approx) possible splits.
# Fit a decision tree model on the training data
#tm <- tree(int_rate ~ ., data = train_data)
# Make predictions on the training and testing data
#tm.train_predictions <- predict(tm, newdata = train_data)
#tm.test_predictions <- predict(tm, newdata = test_data)
# Calculate Mean Squared Error (MSE) for training and testing
#tm.train_mse <- mean((tm.train_predictions - train_data$int_rate)^2)
#tm.test_mse <- mean((tm.test_predictions - test_data$int_rate)^2)
# Calculate Root Mean Squared Error (RMSE) for training and testing
#tm.train_rmse <- sqrt(tm.train_mse)
#tm.test_rmse <- sqrt(tm.test_mse)
# Calculate Mean Absolute Error (MAE) for training and testing
#tm.train_mae <- mean(abs(tm.train_predictions - train_data$int_rate))
#tm.test_mae <- mean(abs(tm.test_predictions - test_data$int_rate))
# Calculate R-squared (R²) for training and testing
#tm.train_r2 <- 1 - (sum((train_data$int_rate - tm.train_predictions)^2) / sum((train_data$int_rate - mean(train_data$int_rate))^2))
#tm.test_r2 <- 1 - (sum((test_data$int_rate - tm.test_predictions)^2) / sum((test_data$int_rate - mean(test_data$int_rate))^2))
# Display the metrics
#cat("Training MSE:", tm.train_mse, "\n")
#cat("Testing MSE:", tm.test_mse, "\n")
#cat("Training RMSE:", tm.train_rmse, "\n")
#cat("Testing RMSE:", tm.test_rmse, "\n")
#cat("Training MAE:", tm.train_mae, "\n")
#cat("Testing MAE:", tm.test_mae, "\n")
#cat("Training R-squared (R²):", tm.train_r2, "\n")
#cat("Testing R-squared (R²):", tm.test_r2, "\n")
Random Forest
#### Random Forest ####
# Train a Random Forest model
rf <- ranger(formula = int_rate ~ ., data = train_data, num.trees = 500, verbose=TRUE, importance = "impurity", oob.error = TRUE)
Growing trees.. Progress: 11%. Estimated remaining time: 4 minutes, 8 seconds.
Growing trees.. Progress: 24%. Estimated remaining time: 3 minutes, 20 seconds.
Growing trees.. Progress: 37%. Estimated remaining time: 2 minutes, 41 seconds.
Growing trees.. Progress: 50%. Estimated remaining time: 2 minutes, 8 seconds.
Growing trees.. Progress: 63%. Estimated remaining time: 1 minute, 34 seconds.
Growing trees.. Progress: 75%. Estimated remaining time: 1 minute, 2 seconds.
Growing trees.. Progress: 88%. Estimated remaining time: 30 seconds.
# Print the model summary
print("Random Forest Model Summary:")
[1] "Random Forest Model Summary:"
print(rf)
Ranger result
Call:
ranger(formula = int_rate ~ ., data = train_data, num.trees = 500, verbose = TRUE, importance = "impurity", oob.error = TRUE)
Type: Regression
Number of trees: 500
Sample size: 638392
Number of independent variables: 40
Mtry: 6
Target node size: 5
Variable importance mode: impurity
Splitrule: variance
OOB prediction error (MSE): 8.744647
R squared (OOB): 0.5446644
# Make predictions on the training and testing data
rf.train_predictions <- predict(rf, data = train_data)
Predicting.. Progress: 81%. Estimated remaining time: 7 seconds.
# Score the held-out test set (ranger exposes the numeric predictions
# through the $predictions component of its prediction object)
rf.test_predictions <- predict(rf, data = test_data)
# --- In-sample (training) metrics ---
rf.train_mse <- mean((rf.train_predictions$predictions - train_data$int_rate)^2)
rf.train_rmse <- sqrt(rf.train_mse)
rf.train_mae <- mean(abs(rf.train_predictions$predictions - train_data$int_rate))
rf.train_r2 <- 1 - sum((train_data$int_rate - rf.train_predictions$predictions)^2) /
  sum((train_data$int_rate - mean(train_data$int_rate))^2)
# --- Held-out (testing) metrics ---
rf.test_mse <- mean((rf.test_predictions$predictions - test_data$int_rate)^2)
rf.test_rmse <- sqrt(rf.test_mse)
rf.test_mae <- mean(abs(rf.test_predictions$predictions - test_data$int_rate))
rf.test_r2 <- 1 - sum((test_data$int_rate - rf.test_predictions$predictions)^2) /
  sum((test_data$int_rate - mean(test_data$int_rate))^2)
# Display the metrics
cat("Training MSE:", rf.train_mse, "\n")
Training MSE: 2.066856
cat("Testing MSE:", rf.test_mse, "\n")
Testing MSE: 8.69323
cat("Training RMSE:", rf.train_rmse, "\n")
Training RMSE: 1.437656
cat("Testing RMSE:", rf.test_rmse, "\n")
Testing RMSE: 2.948428
cat("Training MAE:", rf.train_mae, "\n")
Training MAE: 1.130191
cat("Testing MAE:", rf.test_mae, "\n")
Testing MAE: 2.331673
cat("Training R-squared (R²):", rf.train_r2, "\n")
Training R-squared (R²): 0.8923782
cat("Testing R-squared (R²):", rf.test_r2, "\n")
Testing R-squared (R²): 0.5471326
#rf <- randomForest(int_rate~., data=train_data, ntree = 5, mtry = 3)
#bag.boston=randomForest(medv~.,data=Boston,subset=train, mtry=13,importance =TRUE)
#print(rf)
# Set the number of cores you want to use
#num_cores <- 6 # Adjust this number based on your system's capabilities
# Register parallel backend
#cl <- makeCluster(num_cores)
#registerDoParallel(cl)
# Assuming 'lc_data' is your dataset
#rf_model <- foreach(ntree = rep(100, num_cores), .packages = 'randomForest') %dopar% {
# randomForest(int_rate ~ ., data = lc_data, ntree = ntree, mtry = sqrt(ncol(lc_data)))
#}
# After training, stop the cluster to release the cores:
#stopCluster(cl)
Boosting
#### Boosting ####
# Response vectors for the training and held-out test sets
xgb.y_train <- train_data$int_rate
xgb.y_test <- test_data$int_rate
# Predictor frames: every column except the target
xgb.X_train <- train_data[, names(train_data) != 'int_rate']
xgb.X_test <- test_data[, names(test_data) != 'int_rate']
# Fit a gradient-boosted regression model (100 rounds, xgboost defaults otherwise)
xgb <- xgboost(
  data = as.matrix(xgb.X_train),
  label = xgb.y_train,
  nrounds = 100,
  verbose = 0
)
# Score both partitions
xgb.train_predictions <- predict(xgb, newdata = as.matrix(xgb.X_train))
xgb.test_predictions <- predict(xgb, newdata = as.matrix(xgb.X_test))
# --- Training metrics ---
xgb.train_mse <- mean((xgb.train_predictions - xgb.y_train)^2)
xgb.train_rmse <- sqrt(xgb.train_mse)
xgb.train_mae <- mean(abs(xgb.train_predictions - xgb.y_train))
xgb.train_r2 <- 1 - sum((xgb.y_train - xgb.train_predictions)^2) /
  sum((xgb.y_train - mean(xgb.y_train))^2)
# --- Testing metrics ---
xgb.test_mse <- mean((xgb.test_predictions - xgb.y_test)^2)
xgb.test_rmse <- sqrt(xgb.test_mse)
xgb.test_mae <- mean(abs(xgb.test_predictions - xgb.y_test))
xgb.test_r2 <- 1 - sum((xgb.y_test - xgb.test_predictions)^2) /
  sum((xgb.y_test - mean(xgb.y_test))^2)
# Display the metrics
cat("Training MSE:", xgb.train_mse, "\n")
Training MSE: 7.70773
cat("Testing MSE:", xgb.test_mse, "\n")
Testing MSE: 8.007011
cat("Training RMSE:", xgb.train_rmse, "\n")
Training RMSE: 2.77628
cat("Testing RMSE:", xgb.test_rmse, "\n")
Testing RMSE: 2.829666
cat("Training MAE:", xgb.train_mae, "\n")
Training MAE: 2.179231
cat("Testing MAE:", xgb.test_mae, "\n")
Testing MAE: 2.219719
cat("Training R-squared (R²):", xgb.train_r2, "\n")
Training R-squared (R²): 0.5986562
cat("Testing R-squared (R²):", xgb.test_r2, "\n")
Testing R-squared (R²): 0.5828808
Below, a scatter plot of actual vs. predicted training values is drawn for each model. These plots help us visualize how well each model's predictions align with the actual data points.
# Scatter-plot helper for TRAINING values: actual vs. predicted, with a
# dashed y = x reference line; y-axis clipped to [-50, 50] for readability.
create_scatter_plot <- function(actual_values, predicted_values, model_name) {
  # Pair actual and predicted values for plotting
  plot_df <- data.frame(
    Actual = actual_values,
    Predicted = predicted_values
  )
  ggplot(plot_df, aes(x = Actual, y = Predicted)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
    labs(x = "Actual Training Values", y = "Predicted Training Values", title = model_name) +
    theme_minimal() +
    ylim(-50, 50)
}
# Create scatter plots for each model
# NOTE: these compare TRAINING-set actuals against in-sample predictions.
lm_scatter_plot <- create_scatter_plot(
actual_values = train_data$int_rate,
predicted_values = lm.train_predictions,
model_name = "Linear Regression"
)
# ranger prediction objects expose the numeric vector via $predictions
rf_scatter_plot <- create_scatter_plot(
actual_values = train_data$int_rate,
predicted_values = rf.train_predictions$predictions,
model_name = "Random Forest"
)
# xgb.y_train is train_data$int_rate (assigned in the Boosting section)
xgb_scatter_plot <- create_scatter_plot(
actual_values = xgb.y_train,
predicted_values = xgb.train_predictions,
model_name = "XGBoost"
)
# Display the scatter plots separately
print(lm_scatter_plot)
print(rf_scatter_plot)
print(xgb_scatter_plot)
Below, a scatter plot of actual vs. predicted testing values is drawn for each model. These plots help us visualize how well each model's predictions align with the actual data points.
# Scatter-plot helper for TESTING values. This intentionally redefines the
# training-set version above, changing the axis labels and adding an x-axis
# limit of [0, 40] alongside the y-axis clip of [-50, 50].
create_scatter_plot <- function(actual_values, predicted_values, model_name) {
  # Pair actual and predicted values for plotting
  plot_df <- data.frame(
    Actual = actual_values,
    Predicted = predicted_values
  )
  ggplot(plot_df, aes(x = Actual, y = Predicted)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
    labs(x = "Actual Testing Values", y = "Predicted Testing Values", title = model_name) +
    theme_minimal() +
    ylim(-50, 50) +
    xlim(0, 40)
}
# Create scatter plots for each model
# NOTE: these compare held-out TEST-set actuals against predictions, and
# overwrite the training-plot objects of the same names.
lm_scatter_plot <- create_scatter_plot(
actual_values = test_data$int_rate,
predicted_values = lm.test_predictions,
model_name = "Linear Regression"
)
# ranger prediction objects expose the numeric vector via $predictions
rf_scatter_plot <- create_scatter_plot(
actual_values = test_data$int_rate,
predicted_values = rf.test_predictions$predictions,
model_name = "Random Forest"
)
# xgb.y_test is test_data$int_rate (assigned in the Boosting section)
xgb_scatter_plot <- create_scatter_plot(
actual_values = xgb.y_test,
predicted_values = xgb.test_predictions,
model_name = "XGBoost"
)
# Display the scatter plots separately
print(lm_scatter_plot)
print(rf_scatter_plot)
print(xgb_scatter_plot)
Residual plots can help identify patterns in prediction errors and assess whether the assumptions of linear regression (if applicable) are met.
# Residual-plot helper: residual (actual - predicted) against the predicted
# value, with a red dashed zero line; axes clipped for readability.
create_residual_plot <- function(actual_values, predicted_values, model_name) {
  # Residual = actual - predicted
  resid_df <- data.frame(
    Predicted = predicted_values,
    Residuals = actual_values - predicted_values
  )
  ggplot(resid_df, aes(x = Predicted, y = Residuals)) +
    geom_point() +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    labs(x = "Predicted Values", y = "Residuals", title = paste("Residual Plot -", model_name)) +
    theme_minimal() +
    ylim(-30, 30) +
    xlim(0, 40)
}
# Create residual plots for each model
# NOTE: residuals here are computed on the TRAINING data (in-sample fit).
lm_residual_plot <- create_residual_plot(
actual_values = train_data$int_rate,
predicted_values = lm.train_predictions,
model_name = "Linear Regression"
)
# ranger prediction objects expose the numeric vector via $predictions
rf_residual_plot <- create_residual_plot(
actual_values = train_data$int_rate,
predicted_values = rf.train_predictions$predictions,
model_name = "Random Forest"
)
# xgb.y_train is train_data$int_rate (assigned in the Boosting section)
xgb_residual_plot <- create_residual_plot(
actual_values = xgb.y_train,
predicted_values = xgb.train_predictions,
model_name = "XGBoost"
)
# Display the residual plots separately
print(lm_residual_plot)
print(rf_residual_plot)
print(xgb_residual_plot)
From the plots above we can clearly see that:
This visualization can help you compare the distribution of prediction errors across models.
# Density-plot helper for residuals; axes fixed so the three models'
# plots are directly comparable.
create_residual_density_plot <- function(actual_values, predicted_values, model_name) {
  resid_df <- data.frame(Residuals = actual_values - predicted_values)
  ggplot(resid_df, aes(x = Residuals)) +
    geom_density(fill = "skyblue", color = "black", alpha = 0.7) +
    labs(x = "Residuals", y = "Density", title = paste("Residual Density Plot -", model_name)) +
    theme_minimal() +
    xlim(-30, 30) +
    ylim(0, 0.35)
}
# Create density plots for residuals for each model
# NOTE: residuals here are computed on the TRAINING data (in-sample fit).
lm_residual_density_plot <- create_residual_density_plot(
actual_values = train_data$int_rate,
predicted_values = lm.train_predictions,
model_name = "Linear Regression"
)
# ranger prediction objects expose the numeric vector via $predictions
rf_residual_density_plot <- create_residual_density_plot(
actual_values = train_data$int_rate,
predicted_values = rf.train_predictions$predictions,
model_name = "Random Forest"
)
# xgb.y_train is train_data$int_rate (assigned in the Boosting section)
xgb_residual_density_plot <- create_residual_density_plot(
actual_values = xgb.y_train,
predicted_values = xgb.train_predictions,
model_name = "XGBoost"
)
# Display the density plots separately
print(lm_residual_density_plot)
print(rf_residual_density_plot)
print(xgb_residual_density_plot)
This visualization can help you compare the distribution of prediction errors across models through histograms.
# Histogram helper for residuals with an overlaid density curve in red;
# the histogram's y-axis uses density so the two layers share a scale.
create_residual_histogram_plot <- function(actual_values, predicted_values, model_name) {
  resid_df <- data.frame(Residuals = actual_values - predicted_values)
  ggplot(resid_df, aes(x = Residuals)) +
    geom_histogram(aes(y = after_stat(density)), bins = 30,
                   fill = "skyblue", color = "black", alpha = 0.7) +
    geom_density(color = "red", linewidth = 1.5) +
    labs(x = "Residuals", y = "Density",
         title = paste("Residual Histogram Plot with Density Curve -", model_name)) +
    theme_minimal() +
    xlim(-20, 20) +
    ylim(0, 0.3)
}
# Create histogram plots for residuals for each model
# NOTE: residuals here are computed on the TRAINING data (in-sample fit).
lm_residual_histogram_plot <- create_residual_histogram_plot(
actual_values = train_data$int_rate,
predicted_values = lm.train_predictions,
model_name = "Linear Regression"
)
# ranger prediction objects expose the numeric vector via $predictions
rf_residual_histogram_plot <- create_residual_histogram_plot(
actual_values = train_data$int_rate,
predicted_values = rf.train_predictions$predictions,
model_name = "Random Forest"
)
# xgb.y_train is train_data$int_rate (assigned in the Boosting section)
xgb_residual_histogram_plot <- create_residual_histogram_plot(
actual_values = xgb.y_train,
predicted_values = xgb.train_predictions,
model_name = "XGBoost"
)
# Display the histogram plots separately
print(lm_residual_histogram_plot)
print(rf_residual_histogram_plot)
print(xgb_residual_histogram_plot)
For each model a bar chart that displays the R-squared (coefficient of determination) values is created. R-squared measures the proportion of variance in the target variable explained by the model. Higher R-squared values indicate better model fit.
# Bar charts of R-squared (variance explained) per model; higher is better.
model_names <- c("Linear Regression", "Random Forest", "XGBoost")
r_squared_values_train <- c(lm.train_r2, rf.train_r2, xgb.train_r2)
r_squared_values_test <- c(lm.test_r2, rf.test_r2, xgb.test_r2)
r_squared_data_train <- data.frame(
  Model = factor(model_names),
  R_squared = r_squared_values_train
)
r_squared_data_test <- data.frame(
  Model = factor(model_names),
  R_squared = r_squared_values_test
)
# Training-set comparison; y-axis fixed to [0, 1] so charts are comparable
r_squared_bar_chart_train <- ggplot(r_squared_data_train,
                                    aes(x = Model, y = R_squared, fill = Model)) +
  geom_bar(stat = "identity") +
  labs(x = "Model", y = "R-squared (R²)", title = "R-squared Comparison Training") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(0, 1)
# Testing-set comparison with the same axis limits
r_squared_bar_chart_test <- ggplot(r_squared_data_test,
                                   aes(x = Model, y = R_squared, fill = Model)) +
  geom_bar(stat = "identity") +
  labs(x = "Model", y = "R-squared (R²)", title = "R-squared Comparison Testing") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(0, 1)
# Render both charts
print(r_squared_bar_chart_train)
print(r_squared_bar_chart_test)
A bar chart comparing the MAE and RMSE values is generated for each model. These metrics quantify the average prediction error of each model; lower values are preferred.
# Bar charts of MAE and RMSE per model; lower values are better.
model_names <- c("Linear Regression", "Random Forest", "XGBoost",
                 "Linear Regression", "Random Forest", "XGBoost")
error_values_train <- c(lm.train_mae, rf.train_mae, xgb.train_mae,
                        lm.train_rmse, rf.train_rmse, xgb.train_rmse)
error_values_test <- c(lm.test_mae, rf.test_mae, xgb.test_mae,
                       lm.test_rmse, rf.test_rmse, xgb.test_rmse)
# First three entries are MAE, last three RMSE (parallel to model_names)
error_type <- rep(c("MAE", "RMSE"), each = 3)
model_errors_train <- data.frame(
  Model = factor(model_names, levels = c("Linear Regression", "Random Forest", "XGBoost")),
  Error = error_values_train, Type = error_type
)
model_errors_test <- data.frame(
  Model = factor(model_names, levels = c("Linear Regression", "Random Forest", "XGBoost")),
  Error = error_values_test, Type = error_type
)
# Dodged bars: MAE and RMSE side by side for each model (training data)
error_bar_chart_train <- ggplot(model_errors_train, aes(x = Model, y = Error, fill = Type)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Model", y = "Error Value", title = "Training MAE and RMSE Comparison") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(0, 4)
# Same chart for the held-out test data
error_bar_chart_test <- ggplot(model_errors_test, aes(x = Model, y = Error, fill = Type)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Model", y = "Error Value", title = "Testing MAE and RMSE Comparison") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(0, 4)
# Render both charts
print(error_bar_chart_train)
print(error_bar_chart_test)
#### Random Forest Feature Importance Plot ####
# FIX: vip() has no `title` argument, so the original title was silently
# dropped. vip() returns a ggplot object, so add the title with ggtitle().
v1 <- vip(rf, num_features = 20) + ggtitle("Ranger")
plot(v1)
Learning curve using RMSE and R^2:
# TODO: a real learning curve requires refitting each model at every sample
# size; only single final-fit training metrics are available in this script.
# FIX: the original recycled the 3 final RMSE / R^2 values across every
# sample size, drawing flat, misleading "curves" (its own TODO flagged the
# x-axis). Until per-size refits exist, plot one summary point per model.
model_names <- c("Linear Regression", "Random Forest", "XGBoost")
# Kept for compatibility with any later chunk; unused until refits exist
sample_sizes <- seq(10, nrow(train_data), by = 10)
# One final-fit training RMSE per model
rmse_data <- data.frame(
  Model = model_names,
  RMSE = c(lm.train_rmse, rf.train_rmse, xgb.train_rmse)
)
# One final-fit training R-squared per model
r_squared_data <- data.frame(
  Model = model_names,
  R_squared = c(lm.train_r2, rf.train_r2, xgb.train_r2)
)
# Points, not lines: a line is meaningless with a single value per model
rmse_curve <- ggplot(rmse_data, aes(x = Model, y = RMSE, color = Model)) +
  geom_point(size = 3) +
  labs(x = "Model", y = "RMSE", title = "Final Training RMSE per Model") +
  theme_minimal()
r_squared_curve <- ggplot(r_squared_data, aes(x = Model, y = R_squared, color = Model)) +
  geom_point(size = 3) +
  labs(x = "Model", y = "R-squared", title = "Final Training R-squared per Model") +
  theme_minimal()
# Display the RMSE and R-squared summaries
print(rmse_curve)
print(r_squared_curve)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.